#!/bin/sh
#
# Launches train_flux_kontext/train_flux_kontext_restore_depth.py on a single
# GPU with the depth-restore configuration below.
#
# Relative paths (the training script, the training JSON file, the logging
# directory) are resolved against the current working directory, so run this
# from the directory that contains train_flux_kontext/.
#
# Memory notes recorded by the original author (resolution, batch size,
# memory; presumably peak GPU memory - confirm):
#   256*256    batch size 1   30GB
#   512*512    batch size 1   35GB
#   3072*1024  batch size 1   39GB
#   2024*1024  batch size 1   32GB   (possibly meant 2048*1024 - confirm)

set -eu

# Restrict the process to one physical GPU (device index 4).
export CUDA_VISIBLE_DEVICES=4

# The argument list is built up in groups with `set --` instead of one long
# backslash-continued command, so that any group can be commented out or
# annotated without breaking the line continuation. The first call replaces
# whatever positional parameters the script was invoked with; they are not
# forwarded to the training script.

# Training data and cache location.
set -- \
  --jsonl_for_train data_restore_depth_150.json \
  --cache_dir /mnt/nas/shengjie/cache/

# Training resolution.
set -- "$@" \
  --resolution_height 1024 \
  --resolution_width 1024

# Output and logging locations.
set -- "$@" \
  --output_dir /mnt/nas/shengjie/depth_restore_output_0829/ \
  --logging_dir logs

# Precision and base model.
set -- "$@" \
  --mixed_precision bf16 \
  --pretrained_model_name_or_path /data/models/FLUX.1-Kontext-dev

# Batching and data loading.
set -- "$@" \
  --train_batch_size 1 \
  --gradient_accumulation_steps 4 \
  --dataloader_num_workers 4

# Run length and checkpoint interval.
set -- "$@" \
  --max_train_steps 1000 \
  --checkpointing_steps 100

# --rank is presumably the LoRA rank - confirm against the training script.
set -- "$@" \
  --rank 128 \
  --gradient_checkpointing

# Optimizer schedule.
set -- "$@" \
  --learning_rate 2e-5 \
  --lr_scheduler cosine \
  --lr_warmup_steps 50

# Optional flags, disabled in this configuration; uncomment to enable.
# set -- "$@" --offload
# set -- "$@" --quant_transformers

# Alternative launcher kept from the original (disabled): to use it, replace
# `python` below with
#   accelerate launch --config_file accelerate_config.yaml
python train_flux_kontext/train_flux_kontext_restore_depth.py "$@"